{
 "metadata": {
  "name": "",
  "signature": "sha256:33b58f63005d8bb0bc3878707d8ea8a04902b6379eca42e5e6c26680a530c487"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Data loading, storage, and file formats"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import division\n",
      "from numpy.random import randn\n",
      "import numpy as np\n",
      "import os\n",
      "import matplotlib.pyplot as plt\n",
      "np.random.seed(12345)\n",
      "plt.rc('figure', figsize=(10, 6))\n",
      "from pandas import Series, DataFrame\n",
      "import pandas as pd\n",
      "np.set_printoptions(precision=4)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pwd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%cd ../book_scripts"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Reading and Writing Data in Text Format"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/ex1.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df = pd.read_csv('ch06/ex1.csv')\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pd.read_table('ch06/ex1.csv', sep=',')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/ex2.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pd.read_csv('ch06/ex2.csv', header=None)\n",
      "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "names = ['a', 'b', 'c', 'd', 'message']\n",
      "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/csv_mindex.csv\n",
      "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])\n",
      "parsed"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "list(open('ch06/ex3.txt'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result = pd.read_table('ch06/ex3.txt', sep='\\s+')\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/ex4.csv\n",
      "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/ex5.csv\n",
      "result = pd.read_csv('ch06/ex5.csv')\n",
      "result\n",
      "pd.isnull(result)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n",
      "pd.read_csv('ch06/ex5.csv', na_values=sentinels)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Reading text files in pieces"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result = pd.read_csv('ch06/ex6.csv')\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pd.read_csv('ch06/ex6.csv', nrows=5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n",
      "chunker"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n",
      "\n",
      "tot = Series([])\n",
      "for piece in chunker:\n",
      "    tot = tot.add(piece['key'].value_counts(), fill_value=0)\n",
      "\n",
      "tot = tot.order(ascending=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tot[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Writing data out to text format"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = pd.read_csv('ch06/ex5.csv')\n",
      "data"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.to_csv('ch06/out.csv')\n",
      "!cat ch06/out.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.to_csv(sys.stdout, sep='|')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.to_csv(sys.stdout, na_rep='NULL')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.to_csv(sys.stdout, index=False, header=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "dates = pd.date_range('1/1/2000', periods=7)\n",
      "ts = Series(np.arange(7), index=dates)\n",
      "ts.to_csv('ch06/tseries.csv')\n",
      "!cat ch06/tseries.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Series.from_csv('ch06/tseries.csv', parse_dates=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Manually working with delimited formats"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat ch06/ex7.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import csv\n",
      "f = open('ch06/ex7.csv')\n",
      "\n",
      "reader = csv.reader(f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for line in reader:\n",
      "    print(line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lines = list(csv.reader(open('ch06/ex7.csv')))\n",
      "header, values = lines[0], lines[1:]\n",
      "data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
      "data_dict"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "class my_dialect(csv.Dialect):\n",
      "    lineterminator = '\\n'\n",
      "    delimiter = ';'\n",
      "    quotechar = '\"'\n",
      "    quoting = csv.QUOTE_MINIMAL"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with open('mydata.csv', 'w') as f:\n",
      "    writer = csv.writer(f, dialect=my_dialect)\n",
      "    writer.writerow(('one', 'two', 'three'))\n",
      "    writer.writerow(('1', '2', '3'))\n",
      "    writer.writerow(('4', '5', '6'))\n",
      "    writer.writerow(('7', '8', '9'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%cat mydata.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "JSON data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "obj = \"\"\"\n",
      "{\"name\": \"Wes\",\n",
      " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
      " \"pet\": null,\n",
      " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n",
      "              {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n",
      "}\n",
      "\"\"\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "result = json.loads(obj)\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "asjson = json.dumps(result)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "siblings = DataFrame(result['siblings'], columns=['name', 'age'])\n",
      "siblings"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "XML and HTML, Web scraping"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from lxml.html import parse\n",
      "from urllib2 import urlopen\n",
      "\n",
      "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n",
      "\n",
      "doc = parsed.getroot()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "links = doc.findall('.//a')\n",
      "links[15:20]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lnk = links[28]\n",
      "lnk\n",
      "lnk.get('href')\n",
      "lnk.text_content()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n",
      "urls[-10:]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tables = doc.findall('.//table')\n",
      "calls = tables[9]\n",
      "puts = tables[13]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rows = calls.findall('.//tr')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def _unpack(row, kind='td'):\n",
      "    elts = row.findall('.//%s' % kind)\n",
      "    return [val.text_content() for val in elts]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "_unpack(rows[0], kind='th')\n",
      "_unpack(rows[1], kind='td')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pandas.io.parsers import TextParser\n",
      "\n",
      "def parse_options_data(table):\n",
      "    rows = table.findall('.//tr')\n",
      "    header = _unpack(rows[0], kind='th')\n",
      "    data = [_unpack(r) for r in rows[1:]]\n",
      "    return TextParser(data, names=header).get_chunk()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "call_data = parse_options_data(calls)\n",
      "put_data = parse_options_data(puts)\n",
      "call_data[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Parsing XML with lxml.objectify"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%cd mta_perf/Performance_XML_Data"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!head -21 Performance_MNR.xml"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from lxml import objectify\n",
      "\n",
      "path = 'Performance_MNR.xml'\n",
      "parsed = objectify.parse(open(path))\n",
      "root = parsed.getroot()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = []\n",
      "\n",
      "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n",
      "               'DESIRED_CHANGE', 'DECIMAL_PLACES']\n",
      "\n",
      "for elt in root.INDICATOR:\n",
      "    el_data = {}\n",
      "    for child in elt.getchildren():\n",
      "        if child.tag in skip_fields:\n",
      "            continue\n",
      "        el_data[child.tag] = child.pyval\n",
      "    data.append(el_data)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "perf = DataFrame(data)\n",
      "perf"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "root"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "root.get('href')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "root.text"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Binary data formats"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cd ../.."
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "frame = pd.read_csv('ch06/ex1.csv')\n",
      "frame\n",
      "frame.to_pickle('ch06/frame_pickle')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pd.read_pickle('ch06/frame_pickle')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Using HDF5 format"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "store = pd.HDFStore('mydata.h5')\n",
      "store['obj1'] = frame\n",
      "store['obj1_col'] = frame['a']\n",
      "store"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "store['obj1']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "store.close()\n",
      "os.remove('mydata.h5')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Interacting with HTML and Web APIs"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import requests\n",
      "url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'\n",
      "resp = requests.get(url)\n",
      "resp"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "issue_labels = DataFrame(data)\n",
      "issue_labels"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Interacting with databases"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import sqlite3\n",
      "\n",
      "query = \"\"\"\n",
      "CREATE TABLE test\n",
      "(a VARCHAR(20), b VARCHAR(20),\n",
      " c REAL,        d INTEGER\n",
      ");\"\"\"\n",
      "\n",
      "con = sqlite3.connect(':memory:')\n",
      "con.execute(query)\n",
      "con.commit()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = [('Atlanta', 'Georgia', 1.25, 6),\n",
      "        ('Tallahassee', 'Florida', 2.6, 3),\n",
      "        ('Sacramento', 'California', 1.7, 5)]\n",
      "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
      "\n",
      "con.executemany(stmt, data)\n",
      "con.commit()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cursor = con.execute('select * from test')\n",
      "rows = cursor.fetchall()\n",
      "rows"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cursor.description"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "DataFrame(rows, columns=zip(*cursor.description)[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas.io.sql as sql\n",
      "sql.read_sql('select * from test', con)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}